import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
pd.set_option('display.max_columns',None)
# Reading csv file
df = pd.read_csv('Chennai houseing sale.csv')
df.head(5)
| PRT_ID | AREA | INT_SQFT | DATE_SALE | DIST_MAINROAD | N_BEDROOM | N_BATHROOM | N_ROOM | SALE_COND | PARK_FACIL | DATE_BUILD | BUILDTYPE | UTILITY_AVAIL | STREET | MZZONE | QS_ROOMS | QS_BATHROOM | QS_BEDROOM | QS_OVERALL | REG_FEE | COMMIS | SALES_PRICE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | P03210 | Karapakkam | 1004 | 04-05-2011 | 131 | 1.0 | 1.0 | 3 | AbNormal | Yes | 15-05-1967 | Commercial | AllPub | Paved | A | 4.0 | 3.9 | 4.9 | 4.330 | 380000 | 144400 | 7600000 |
| 1 | P09411 | Anna Nagar | 1986 | 19-12-2006 | 26 | 2.0 | 1.0 | 5 | AbNormal | No | 22-12-1995 | Commercial | AllPub | Gravel | RH | 4.9 | 4.2 | 2.5 | 3.765 | 760122 | 304049 | 21717770 |
| 2 | P01812 | Adyar | 909 | 04-02-2012 | 70 | 1.0 | 1.0 | 3 | AbNormal | Yes | 09-02-1992 | Commercial | ELO | Gravel | RL | 4.1 | 3.8 | 2.2 | 3.090 | 421094 | 92114 | 13159200 |
| 3 | P05346 | Velachery | 1855 | 13-03-2010 | 14 | 3.0 | 2.0 | 5 | Family | No | 18-03-1988 | Others | NoSewr | Paved | I | 4.7 | 3.9 | 3.6 | 4.010 | 356321 | 77042 | 9630290 |
| 4 | P06210 | Karapakkam | 1226 | 05-10-2009 | 84 | 1.0 | 1.0 | 3 | AbNormal | Yes | 13-10-1979 | Others | AllPub | Gravel | C | 3.0 | 2.5 | 4.1 | 3.290 | 237000 | 74063 | 7406250 |
# Information about the Data
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 7109 entries, 0 to 7108 Data columns (total 22 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 PRT_ID 7109 non-null object 1 AREA 7109 non-null object 2 INT_SQFT 7109 non-null int64 3 DATE_SALE 7109 non-null object 4 DIST_MAINROAD 7109 non-null int64 5 N_BEDROOM 7108 non-null float64 6 N_BATHROOM 7104 non-null float64 7 N_ROOM 7109 non-null int64 8 SALE_COND 7109 non-null object 9 PARK_FACIL 7109 non-null object 10 DATE_BUILD 7109 non-null object 11 BUILDTYPE 7109 non-null object 12 UTILITY_AVAIL 7109 non-null object 13 STREET 7109 non-null object 14 MZZONE 7109 non-null object 15 QS_ROOMS 7109 non-null float64 16 QS_BATHROOM 7109 non-null float64 17 QS_BEDROOM 7109 non-null float64 18 QS_OVERALL 7061 non-null float64 19 REG_FEE 7109 non-null int64 20 COMMIS 7109 non-null int64 21 SALES_PRICE 7109 non-null int64 dtypes: float64(6), int64(6), object(10) memory usage: 1.2+ MB
# Changing Date Column to Datetime Datatype
df['DATE_SALE'] = pd.to_datetime(df['DATE_SALE'], format= '%d-%m-%Y')
df['DATE_BUILD'] = pd.to_datetime(df['DATE_BUILD'], format= '%d-%m-%Y')
df['PRT_ID'] = df['PRT_ID'].str[1:].astype('int64')
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 7109 entries, 0 to 7108 Data columns (total 22 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 PRT_ID 7109 non-null int64 1 AREA 7109 non-null object 2 INT_SQFT 7109 non-null int64 3 DATE_SALE 7109 non-null datetime64[ns] 4 DIST_MAINROAD 7109 non-null int64 5 N_BEDROOM 7108 non-null float64 6 N_BATHROOM 7104 non-null float64 7 N_ROOM 7109 non-null int64 8 SALE_COND 7109 non-null object 9 PARK_FACIL 7109 non-null object 10 DATE_BUILD 7109 non-null datetime64[ns] 11 BUILDTYPE 7109 non-null object 12 UTILITY_AVAIL 7109 non-null object 13 STREET 7109 non-null object 14 MZZONE 7109 non-null object 15 QS_ROOMS 7109 non-null float64 16 QS_BATHROOM 7109 non-null float64 17 QS_BEDROOM 7109 non-null float64 18 QS_OVERALL 7061 non-null float64 19 REG_FEE 7109 non-null int64 20 COMMIS 7109 non-null int64 21 SALES_PRICE 7109 non-null int64 dtypes: datetime64[ns](2), float64(6), int64(7), object(7) memory usage: 1.2+ MB
df.head(2)
| PRT_ID | AREA | INT_SQFT | DATE_SALE | DIST_MAINROAD | N_BEDROOM | N_BATHROOM | N_ROOM | SALE_COND | PARK_FACIL | DATE_BUILD | BUILDTYPE | UTILITY_AVAIL | STREET | MZZONE | QS_ROOMS | QS_BATHROOM | QS_BEDROOM | QS_OVERALL | REG_FEE | COMMIS | SALES_PRICE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3210 | Karapakkam | 1004 | 2011-05-04 | 131 | 1.0 | 1.0 | 3 | AbNormal | Yes | 1967-05-15 | Commercial | AllPub | Paved | A | 4.0 | 3.9 | 4.9 | 4.330 | 380000 | 144400 | 7600000 |
| 1 | 9411 | Anna Nagar | 1986 | 2006-12-19 | 26 | 2.0 | 1.0 | 5 | AbNormal | No | 1995-12-22 | Commercial | AllPub | Gravel | RH | 4.9 | 4.2 | 2.5 | 3.765 | 760122 | 304049 | 21717770 |
# Size of the data
df.shape
(7109, 22)
# Checking for Null Values
df.isnull().sum()
PRT_ID 0 AREA 0 INT_SQFT 0 DATE_SALE 0 DIST_MAINROAD 0 N_BEDROOM 1 N_BATHROOM 5 N_ROOM 0 SALE_COND 0 PARK_FACIL 0 DATE_BUILD 0 BUILDTYPE 0 UTILITY_AVAIL 0 STREET 0 MZZONE 0 QS_ROOMS 0 QS_BATHROOM 0 QS_BEDROOM 0 QS_OVERALL 48 REG_FEE 0 COMMIS 0 SALES_PRICE 0 dtype: int64
# Replacing the Null Values in 'QS_OVERALL' column by the Average of 'QS_ROOMS', 'QS_BATHROOM', 'QS_BEDROOM'.
df['QS_OVERALL'] = df['QS_OVERALL'].fillna(round(df[['QS_ROOMS', 'QS_BATHROOM', 'QS_BEDROOM']].mean(axis=1), 2))
# Droping Null Values
df.dropna(inplace=True)
# Checking if Null Values are gone
df.isnull().sum()
PRT_ID 0 AREA 0 INT_SQFT 0 DATE_SALE 0 DIST_MAINROAD 0 N_BEDROOM 0 N_BATHROOM 0 N_ROOM 0 SALE_COND 0 PARK_FACIL 0 DATE_BUILD 0 BUILDTYPE 0 UTILITY_AVAIL 0 STREET 0 MZZONE 0 QS_ROOMS 0 QS_BATHROOM 0 QS_BEDROOM 0 QS_OVERALL 0 REG_FEE 0 COMMIS 0 SALES_PRICE 0 dtype: int64
df.shape
(7103, 22)
# Changing datatype to int
df['N_BEDROOM'] = df['N_BEDROOM'].fillna(0).astype('int64')
df['N_BATHROOM'] = df['N_BATHROOM'].fillna(0).astype('int64')
df.info()
<class 'pandas.core.frame.DataFrame'> Index: 7103 entries, 0 to 7108 Data columns (total 22 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 PRT_ID 7103 non-null int64 1 AREA 7103 non-null object 2 INT_SQFT 7103 non-null int64 3 DATE_SALE 7103 non-null datetime64[ns] 4 DIST_MAINROAD 7103 non-null int64 5 N_BEDROOM 7103 non-null int64 6 N_BATHROOM 7103 non-null int64 7 N_ROOM 7103 non-null int64 8 SALE_COND 7103 non-null object 9 PARK_FACIL 7103 non-null object 10 DATE_BUILD 7103 non-null datetime64[ns] 11 BUILDTYPE 7103 non-null object 12 UTILITY_AVAIL 7103 non-null object 13 STREET 7103 non-null object 14 MZZONE 7103 non-null object 15 QS_ROOMS 7103 non-null float64 16 QS_BATHROOM 7103 non-null float64 17 QS_BEDROOM 7103 non-null float64 18 QS_OVERALL 7103 non-null float64 19 REG_FEE 7103 non-null int64 20 COMMIS 7103 non-null int64 21 SALES_PRICE 7103 non-null int64 dtypes: datetime64[ns](2), float64(4), int64(9), object(7) memory usage: 1.2+ MB
# lets see the unique values of columns
for cols in df.columns:
if df[cols].dtype == object:
print()
print(cols)
print(df[cols].unique())
AREA ['Karapakkam' 'Anna Nagar' 'Adyar' 'Velachery' 'Chrompet' 'KK Nagar' 'TNagar' 'T Nagar' 'Chrompt' 'Chrmpet' 'Karapakam' 'Ana Nagar' 'Chormpet' 'Adyr' 'Velchery' 'Ann Nagar' 'KKNagar'] SALE_COND ['AbNormal' 'Family' 'Partial' 'AdjLand' 'Normal Sale' 'Ab Normal' 'Partiall' 'Adj Land' 'PartiaLl'] PARK_FACIL ['Yes' 'No' 'Noo'] BUILDTYPE ['Commercial' 'Others' 'Other' 'House' 'Comercial'] UTILITY_AVAIL ['AllPub' 'ELO' 'NoSewr ' 'NoSeWa' 'All Pub'] STREET ['Paved' 'Gravel' 'No Access' 'Pavd' 'NoAccess'] MZZONE ['A' 'RH' 'RL' 'I' 'C' 'RM']
There seems to be some misspells in the Dataset¶
# Cleaning the Data
df.AREA.replace(['Ana Nagar','Ann Nagar'],'Anna Nagar',inplace=True)
df.AREA.replace('Karapakam','Karapakkam',inplace=True)
df.AREA.replace(['Chrompt','Chrmpet','Chormpet','Chrompet'],'Chromepet',inplace=True)
df.AREA.replace('KKNagar','KK Nagar',inplace=True)
df.AREA.replace('TNagar','T Nagar',inplace=True)
df.AREA.replace('Adyr','Adyar',inplace=True)
df.AREA.replace('Velchery','Velachery',inplace=True)
df.BUILDTYPE.replace('Comercial','Commercial',inplace=True)
df.BUILDTYPE.replace('Other','Others',inplace=True)
df.UTILITY_AVAIL.replace('All Pub','AllPub',inplace=True)
df.SALE_COND.replace('Ab Normal','AbNormal',inplace=True)
df.SALE_COND.replace(['PartiaLl','Partiall'],'Partial',inplace=True)
df.SALE_COND.replace('Adj Land','AdjLand',inplace=True)
df.PARK_FACIL.replace('Noo','No',inplace=True)
df.STREET.replace('Pavd','Paved',inplace=True)
df.STREET.replace('NoAccess','No Access',inplace=True)
# Checking if the changes has been applied
for cols in df.columns:
if df[cols].dtype == object:
print()
print(cols)
print(df[cols].unique())
AREA ['Karapakkam' 'Anna Nagar' 'Adyar' 'Velachery' 'Chromepet' 'KK Nagar' 'T Nagar'] SALE_COND ['AbNormal' 'Family' 'Partial' 'AdjLand' 'Normal Sale'] PARK_FACIL ['Yes' 'No'] BUILDTYPE ['Commercial' 'Others' 'House'] UTILITY_AVAIL ['AllPub' 'ELO' 'NoSewr ' 'NoSeWa'] STREET ['Paved' 'Gravel' 'No Access'] MZZONE ['A' 'RH' 'RL' 'I' 'C' 'RM']
df.head(3)
| PRT_ID | AREA | INT_SQFT | DATE_SALE | DIST_MAINROAD | N_BEDROOM | N_BATHROOM | N_ROOM | SALE_COND | PARK_FACIL | DATE_BUILD | BUILDTYPE | UTILITY_AVAIL | STREET | MZZONE | QS_ROOMS | QS_BATHROOM | QS_BEDROOM | QS_OVERALL | REG_FEE | COMMIS | SALES_PRICE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3210 | Karapakkam | 1004 | 2011-05-04 | 131 | 1 | 1 | 3 | AbNormal | Yes | 1967-05-15 | Commercial | AllPub | Paved | A | 4.0 | 3.9 | 4.9 | 4.330 | 380000 | 144400 | 7600000 |
| 1 | 9411 | Anna Nagar | 1986 | 2006-12-19 | 26 | 2 | 1 | 5 | AbNormal | No | 1995-12-22 | Commercial | AllPub | Gravel | RH | 4.9 | 4.2 | 2.5 | 3.765 | 760122 | 304049 | 21717770 |
| 2 | 1812 | Adyar | 909 | 2012-02-04 | 70 | 1 | 1 | 3 | AbNormal | Yes | 1992-02-09 | Commercial | ELO | Gravel | RL | 4.1 | 3.8 | 2.2 | 3.090 | 421094 | 92114 | 13159200 |
The Dataset is Cleaned and Ready for Analysis¶
# Stastical Description of Data
df.describe()
| PRT_ID | INT_SQFT | DATE_SALE | DIST_MAINROAD | N_BEDROOM | N_BATHROOM | N_ROOM | DATE_BUILD | QS_ROOMS | QS_BATHROOM | QS_BEDROOM | QS_OVERALL | REG_FEE | COMMIS | SALES_PRICE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 7103.000000 | 7103.000000 | 7103 | 7103.000000 | 7103.000000 | 7103.00000 | 7103.000000 | 7103 | 7103.000000 | 7103.000000 | 7103.000000 | 7103.000000 | 7103.000000 | 7103.000000 | 7.103000e+03 |
| mean | 5034.853865 | 1382.117556 | 2010-04-06 11:12:03.243699968 | 99.578629 | 1.637336 | 1.21329 | 3.688723 | 1985-12-28 20:14:21.607771392 | 3.517753 | 3.507377 | 3.485513 | 3.504131 | 376952.707588 | 141055.364212 | 1.089477e+07 |
| min | 1.000000 | 500.000000 | 2004-01-16 00:00:00 | 0.000000 | 1.000000 | 1.00000 | 2.000000 | 1949-10-28 00:00:00 | 2.000000 | 2.000000 | 2.000000 | 2.000000 | 71177.000000 | 5055.000000 | 2.156875e+06 |
| 25% | 2536.500000 | 993.000000 | 2008-10-17 00:00:00 | 50.000000 | 1.000000 | 1.00000 | 3.000000 | 1976-08-20 12:00:00 | 2.700000 | 2.700000 | 2.700000 | 3.130000 | 272469.000000 | 84252.500000 | 8.270900e+06 |
| 50% | 5085.000000 | 1373.000000 | 2010-05-03 00:00:00 | 99.000000 | 1.000000 | 1.00000 | 4.000000 | 1986-10-13 00:00:00 | 3.500000 | 3.500000 | 3.500000 | 3.500000 | 349509.000000 | 127644.000000 | 1.033464e+07 |
| 75% | 7513.500000 | 1744.000000 | 2011-09-15 00:00:00 | 148.000000 | 2.000000 | 1.00000 | 4.000000 | 1996-07-01 00:00:00 | 4.300000 | 4.300000 | 4.300000 | 3.890000 | 451544.500000 | 184534.000000 | 1.299422e+07 |
| max | 10034.000000 | 2500.000000 | 2015-02-17 00:00:00 | 200.000000 | 4.000000 | 2.00000 | 6.000000 | 2010-11-12 00:00:00 | 5.000000 | 5.000000 | 5.000000 | 4.970000 | 983922.000000 | 495405.000000 | 2.366734e+07 |
| std | 2885.722305 | 457.484517 | NaN | 57.399230 | 0.803030 | 0.40966 | 1.019299 | NaN | 0.891931 | 0.898042 | 0.887067 | 0.527168 | 143088.109006 | 78775.875689 | 3.769549e+06 |
df.AREA.value_counts()
AREA Chromepet 1699 Karapakkam 1366 KK Nagar 997 Velachery 981 Anna Nagar 785 Adyar 774 T Nagar 501 Name: count, dtype: int64
# Distribution of Houses in various Areas
px.pie(df.groupby('AREA',as_index=False)['PRT_ID'].count(), values='PRT_ID', names='AREA',
title='<b> Number of houses in various areas of Chennai</b>', labels={'PRT_ID':'Count'},
color_discrete_sequence=px.colors.sequential.Plasma, hole=.5)
Insights :¶
- Chrompet has the highest number of houses among all areas in Chennai.
- T Nagar has a lower number of houses compared to other areas in Chennai.
px.box(df, x='AREA', y='SALES_PRICE', title= '<b>Sales Price of houses in various areas of Chennai',
color= 'AREA')
Insights :¶
- T Nagar and Anna Nagar areas have the highest selling prices for houses, while KK Nagar comes in Third.
- Houses in Karapakam area have lower selling prices compared to houses in other areas.
px.scatter(df,x= 'INT_SQFT', y= 'SALES_PRICE', color= 'AREA', size= 'INT_SQFT',
title= '<b> SQFT Versus SALES PRICE of Houses in Various Areas', labels= {'INT_SQFT': 'SQFT'})
Insights :¶
- An increase in the square footage of houses is associated with an increase in the sales price of houses.
- Houses in the same area tend to have similar square footage.
- Houses in T Nagar and Anna Nagar areas have the highest prices, with square footage typically ranging from 1500 to 2000.
- KK Nagar area houses have Higher square foot, typically ranging from 1400 to 2500.
px.scatter(df, x='DIST_MAINROAD', y='SALES_PRICE', color='AREA', size='SALES_PRICE',
title='<b> MAINROAD DISTANCE Versus SALES PRICE of Houses in Various Areas',
labels= {'DIST_MAINROAD':'Distance from Mainroad'})
Insights :¶
- It appears that the distance to the main road does not significantly affect the sales price of houses.
- Houses with both shorter and longer distances to the main road have similar prices, indicating that the main road distance does not have a strong impact on the sales price.
fig=px.scatter(df, x='DATE_BUILD', y='SALES_PRICE', color='AREA', size='SALES_PRICE',
title='<b> Build Date Vs Sales Price')
fig.show()
fig=px.scatter(df, x='DATE_SALE', y='SALES_PRICE', color='AREA', size='SALES_PRICE',
title='<b> Sale Date Vs Sales Price')
fig.show()
Insights :¶
- The columns 'DATE_BUILD' and 'DATE_SALE' do not appear to have a significant impact on the 'SALES_PRICE' of properties.
- This suggests that the dates when the properties were built and sold do not directly influence their selling prices.
px.box(df, x='N_ROOM', y='SALES_PRICE', color='AREA', labels= {'N_ROOM':'No.of Rooms'},
title='<b> Total Rooms Versus Sales Price of Houses in Various Areas')
Insights:¶
There is a positive correlation between the number of rooms ('N_ROOM') and the sales price ('SALES_PRICE'). As the number of rooms increases, the sales price also tends to increase.
The majority of houses in the dataset have 4 to 5 rooms, indicating that this range is common among the properties.
Only KK Nagar has houses with 6 rooms. This aligns with the previous insight showing that KK Nagar has the highest square footage houses, suggesting that larger houses with more rooms are more common in KK Nagar.
Anna Nagar and T Nagar have houses predominantly with 4 to 5 rooms. Despite this, they have higher sales prices, indicating that these areas are desirable and command higher prices despite having fewer properties with more rooms.
px.box(df, x='N_BEDROOM', y='SALES_PRICE', color='AREA', labels= {'N_BEDROOM':'No.of Bedrooms'},
title='<b>Total BedRooms Versus Sales Price of Houses in Various Areas')
Insights:¶
There is a positive correlation between the number of bedrooms ('N_BEDROOM') and the sales price ('SALES_PRICE'). As the number of bedrooms increases, the sales price also tends to increase.
While an increase in the number of bedrooms tends to increase the sales price, the sales are directly affected by the area where the property is located.
The majority of houses in the dataset have 1 to 2 bedrooms, indicating that this range is common among the properties.
KK Nagar and Velachery are the only areas with 3-bedroom houses, but KK Nagar also has 4-bedroom houses, indicating a wider range of housing options in KK Nagar.
Anna Nagar and T Nagar have houses predominantly with 1 to 2 bedrooms. Despite this, they have higher sales prices, suggesting that the area itself has a direct effect on the sales price, regardless of the number of bedrooms in the properties.
px.box(df,x='N_BATHROOM', y='SALES_PRICE', color='AREA', labels= {'N_BATHROOM':'No.of Bathrooms'},
title='<b>Total BathRooms Versus Sales Price of Houses in Various Areas')
Insights:¶
There is a positive correlation between the number of bathrooms ('N_BATHROOM') and the sales price ('SALES_PRICE'). As the number of bathrooms increases, the sales price also tends to increase.
More than half of the houses in the dataset have 1 bathroom, indicating that this is the most common number of bathrooms among the properties.
Anna Nagar and T Nagar have houses with only a single bathroom. Despite this, they have higher sales prices, suggesting that these areas are desirable and command higher prices despite having fewer bathrooms in the properties.
px.box(df, x='SALE_COND', y='SALES_PRICE', color='AREA', labels= {'SALE_COND':'Sale Condition'},
title='<b>Sale Condition Versus Sales Price of Houses in various Areas')
Insights:¶
- There doesn't seem to be much difference in sales price based on sale condition.
- The sale condition doesn't appear to have a significant impact on the sales price of properties.
px.box(df, x='PARK_FACIL', y='SALES_PRICE', color='AREA', labels= {'PARK_FACIL':'Parking Facility'},
title='<b>Parking Facility Versus Sales Price of Houses in Different Areas')
Insight:¶
- Houses with a parking facility tend to have slightly higher prices across different areas.
- This suggests that having a parking facility is a desirable feature that can positively impact the sales price of a house.
px.box(df, x='BUILDTYPE', y='SALES_PRICE', color='AREA',
title='<b>Build Type Versus Sales Price of Houses in Different Areas')
Insights:¶
- Commercial houses appear to have significantly higher prices compared to residential houses and other property types in different areas.
px.box(df, x='UTILITY_AVAIL', y='SALES_PRICE', color='AREA',
title='<b> Utility Available Versus Sales Price of Houses in Different Areas')
Insight:¶
- Across different utility availability types there isn't a significant difference in sales prices.
- This suggests that in this dataset utility availability may not be a major factor affecting property sales prices.
px.box(df, x='STREET', y='SALES_PRICE', color='AREA',
title='<b> Street Versus Sales Price of Houses in Different Areas')
Insight:¶
- There isn't much difference in sales price between properties located on a Paved street and those on a Gravel street.
- Properties with no street access have lower sales prices compared to properties on both Paved and Gravel streets.
px.box(df, x='MZZONE', y='SALES_PRICE', color='AREA',
title='<b> MZZONE Versus Sales Price of Houses in Different Areas')
Insight:¶
Houses in RM (Residential Medium Density) zones tend to have the highest sales prices among all zones, followed by RL (Residential Low Density) and RH (Residential High Density) zones.
Houses in A (Agricultural) zones tend to have lower sales prices compared to all, typically indicates areas designated for agricultural use.
The areas 'Anna Nagar', 'Chromepet', 'KK Nagar', and 'T Nagar' only have houses in residential zones, indicating that these areas are primarily residential in nature.
numeric_columns = ['INT_SQFT', 'DIST_MAINROAD', 'N_BEDROOM', 'N_BATHROOM', 'N_ROOM', 'QS_ROOMS', 'QS_BATHROOM', 'QS_BEDROOM', 'QS_OVERALL', 'REG_FEE', 'COMMIS', 'SALES_PRICE']
numeric_df = df[numeric_columns]
corr_matrix = numeric_df.corr()
numeric_df.corr()
| INT_SQFT | DIST_MAINROAD | N_BEDROOM | N_BATHROOM | N_ROOM | QS_ROOMS | QS_BATHROOM | QS_BEDROOM | QS_OVERALL | REG_FEE | COMMIS | SALES_PRICE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| INT_SQFT | 1.000000 | 0.001973 | 0.786392 | 0.515433 | 0.951271 | 0.019720 | -0.008168 | 0.008385 | 0.012163 | 0.657435 | 0.571125 | 0.612026 |
| DIST_MAINROAD | 0.001973 | 1.000000 | -0.002644 | 0.002074 | 0.002136 | 0.001762 | -0.029529 | 0.001890 | -0.016771 | 0.011608 | 0.011218 | 0.018679 |
| N_BEDROOM | 0.786392 | -0.002644 | 1.000000 | 0.755214 | 0.840357 | 0.014750 | -0.007692 | 0.015335 | 0.012541 | 0.455301 | 0.430224 | 0.330933 |
| N_BATHROOM | 0.515433 | 0.002074 | 0.755214 | 1.000000 | 0.568725 | 0.013219 | -0.012124 | 0.013076 | 0.007565 | 0.260299 | 0.256510 | 0.108884 |
| N_ROOM | 0.951271 | 0.002136 | 0.840357 | 0.568725 | 1.000000 | 0.016410 | -0.007428 | 0.014618 | 0.014053 | 0.630801 | 0.533423 | 0.602637 |
| QS_ROOMS | 0.019720 | 0.001762 | 0.014750 | 0.013219 | 0.016410 | 1.000000 | 0.008809 | 0.008284 | 0.517387 | 0.019772 | 0.008976 | 0.021960 |
| QS_BATHROOM | -0.008168 | -0.029529 | -0.007692 | -0.012124 | -0.007428 | 0.008809 | 1.000000 | -0.011646 | 0.551877 | -0.006705 | -0.000585 | -0.011361 |
| QS_BEDROOM | 0.008385 | 0.001890 | 0.015335 | 0.013076 | 0.014618 | 0.008284 | -0.011646 | 1.000000 | 0.630074 | 0.020903 | 0.019928 | 0.018430 |
| QS_OVERALL | 0.012163 | -0.016771 | 0.012541 | 0.007565 | 0.014053 | 0.517387 | 0.551877 | 0.630074 | 1.000000 | 0.021101 | 0.016716 | 0.019786 |
| REG_FEE | 0.657435 | 0.011608 | 0.455301 | 0.260299 | 0.630801 | 0.019772 | -0.006705 | 0.020903 | 0.021101 | 1.000000 | 0.660004 | 0.878126 |
| COMMIS | 0.571125 | 0.011218 | 0.430224 | 0.256510 | 0.533423 | 0.008976 | -0.000585 | 0.019928 | 0.016716 | 0.660004 | 1.000000 | 0.626413 |
| SALES_PRICE | 0.612026 | 0.018679 | 0.330933 | 0.108884 | 0.602637 | 0.021960 | -0.011361 | 0.018430 | 0.019786 | 0.878126 | 0.626413 | 1.000000 |
plt.figure(figsize=(12,6))
sns.heatmap(numeric_df.corr(),annot=True,cmap='GnBu')
plt.title('Correlation of Numeric Column')
plt.show()
df.head(5)
| PRT_ID | AREA | INT_SQFT | DATE_SALE | DIST_MAINROAD | N_BEDROOM | N_BATHROOM | N_ROOM | SALE_COND | PARK_FACIL | DATE_BUILD | BUILDTYPE | UTILITY_AVAIL | STREET | MZZONE | QS_ROOMS | QS_BATHROOM | QS_BEDROOM | QS_OVERALL | REG_FEE | COMMIS | SALES_PRICE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3210 | Karapakkam | 1004 | 2011-05-04 | 131 | 1 | 1 | 3 | AbNormal | Yes | 1967-05-15 | Commercial | AllPub | Paved | A | 4.0 | 3.9 | 4.9 | 4.330 | 380000 | 144400 | 7600000 |
| 1 | 9411 | Anna Nagar | 1986 | 2006-12-19 | 26 | 2 | 1 | 5 | AbNormal | No | 1995-12-22 | Commercial | AllPub | Gravel | RH | 4.9 | 4.2 | 2.5 | 3.765 | 760122 | 304049 | 21717770 |
| 2 | 1812 | Adyar | 909 | 2012-02-04 | 70 | 1 | 1 | 3 | AbNormal | Yes | 1992-02-09 | Commercial | ELO | Gravel | RL | 4.1 | 3.8 | 2.2 | 3.090 | 421094 | 92114 | 13159200 |
| 3 | 5346 | Velachery | 1855 | 2010-03-13 | 14 | 3 | 2 | 5 | Family | No | 1988-03-18 | Others | NoSewr | Paved | I | 4.7 | 3.9 | 3.6 | 4.010 | 356321 | 77042 | 9630290 |
| 4 | 6210 | Karapakkam | 1226 | 2009-10-05 | 84 | 1 | 1 | 3 | AbNormal | Yes | 1979-10-13 | Others | AllPub | Gravel | C | 3.0 | 2.5 | 4.1 | 3.290 | 237000 | 74063 | 7406250 |
# Droping Unnecessary Columns based on our insights
df.drop(['DATE_SALE', 'DATE_BUILD', 'DIST_MAINROAD', 'SALE_COND','UTILITY_AVAIL', 'STREET',
'QS_ROOMS', 'QS_BATHROOM', 'QS_BEDROOM', 'QS_OVERALL', 'REG_FEE', 'COMMIS'], axis=1, inplace=True)
df
| PRT_ID | AREA | INT_SQFT | N_BEDROOM | N_BATHROOM | N_ROOM | PARK_FACIL | BUILDTYPE | MZZONE | SALES_PRICE | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3210 | Karapakkam | 1004 | 1 | 1 | 3 | Yes | Commercial | A | 7600000 |
| 1 | 9411 | Anna Nagar | 1986 | 2 | 1 | 5 | No | Commercial | RH | 21717770 |
| 2 | 1812 | Adyar | 909 | 1 | 1 | 3 | Yes | Commercial | RL | 13159200 |
| 3 | 5346 | Velachery | 1855 | 3 | 2 | 5 | No | Others | I | 9630290 |
| 4 | 6210 | Karapakkam | 1226 | 1 | 1 | 3 | Yes | Others | C | 7406250 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7104 | 3834 | Karapakkam | 598 | 1 | 1 | 2 | No | Others | RM | 5353000 |
| 7105 | 10000 | Velachery | 1897 | 3 | 2 | 5 | Yes | Others | RH | 10818480 |
| 7106 | 9594 | Velachery | 1614 | 2 | 1 | 4 | No | House | I | 8351410 |
| 7107 | 6508 | Karapakkam | 787 | 1 | 1 | 2 | Yes | Commercial | RL | 8507000 |
| 7108 | 9794 | Velachery | 1896 | 3 | 2 | 5 | Yes | Others | I | 9976480 |
7103 rows × 10 columns
# Creating a backup file
df_bk=df.copy()
df_bk
| PRT_ID | AREA | INT_SQFT | N_BEDROOM | N_BATHROOM | N_ROOM | PARK_FACIL | BUILDTYPE | MZZONE | SALES_PRICE | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3210 | Karapakkam | 1004 | 1 | 1 | 3 | Yes | Commercial | A | 7600000 |
| 1 | 9411 | Anna Nagar | 1986 | 2 | 1 | 5 | No | Commercial | RH | 21717770 |
| 2 | 1812 | Adyar | 909 | 1 | 1 | 3 | Yes | Commercial | RL | 13159200 |
| 3 | 5346 | Velachery | 1855 | 3 | 2 | 5 | No | Others | I | 9630290 |
| 4 | 6210 | Karapakkam | 1226 | 1 | 1 | 3 | Yes | Others | C | 7406250 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7104 | 3834 | Karapakkam | 598 | 1 | 1 | 2 | No | Others | RM | 5353000 |
| 7105 | 10000 | Velachery | 1897 | 3 | 2 | 5 | Yes | Others | RH | 10818480 |
| 7106 | 9594 | Velachery | 1614 | 2 | 1 | 4 | No | House | I | 8351410 |
| 7107 | 6508 | Karapakkam | 787 | 1 | 1 | 2 | Yes | Commercial | RL | 8507000 |
| 7108 | 9794 | Velachery | 1896 | 3 | 2 | 5 | Yes | Others | I | 9976480 |
7103 rows × 10 columns
Coverting the labels into a numeric form using Label Encoder¶
from sklearn.preprocessing import LabelEncoder
encoders = {}
original_categories = {}
# Iterate over each column
for col in df.select_dtypes(include='object').columns:
# Create a LabelEncoder object
encoders[col] = LabelEncoder()
# Fit and transform the data for each column
df[col] = encoders[col].fit_transform(df[col])
# Store the original categories
original_categories[col] = encoders[col].classes_
# Print original categories and their corresponding encoded values
print(f"Column: {col}")
for category, encoded_value in zip(original_categories[col], encoders[col].transform(original_categories[col])):
print(f" Original Category: {category} Encoded Value: {encoded_value}")
Column: AREA Original Category: Adyar Encoded Value: 0 Original Category: Anna Nagar Encoded Value: 1 Original Category: Chromepet Encoded Value: 2 Original Category: KK Nagar Encoded Value: 3 Original Category: Karapakkam Encoded Value: 4 Original Category: T Nagar Encoded Value: 5 Original Category: Velachery Encoded Value: 6 Column: PARK_FACIL Original Category: No Encoded Value: 0 Original Category: Yes Encoded Value: 1 Column: BUILDTYPE Original Category: Commercial Encoded Value: 0 Original Category: House Encoded Value: 1 Original Category: Others Encoded Value: 2 Column: MZZONE Original Category: A Encoded Value: 0 Original Category: C Encoded Value: 1 Original Category: I Encoded Value: 2 Original Category: RH Encoded Value: 3 Original Category: RL Encoded Value: 4 Original Category: RM Encoded Value: 5
df.head()
| PRT_ID | AREA | INT_SQFT | N_BEDROOM | N_BATHROOM | N_ROOM | PARK_FACIL | BUILDTYPE | MZZONE | SALES_PRICE | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3210 | 4 | 1004 | 1 | 1 | 3 | 1 | 0 | 0 | 7600000 |
| 1 | 9411 | 1 | 1986 | 2 | 1 | 5 | 0 | 0 | 3 | 21717770 |
| 2 | 1812 | 0 | 909 | 1 | 1 | 3 | 1 | 0 | 4 | 13159200 |
| 3 | 5346 | 6 | 1855 | 3 | 2 | 5 | 0 | 2 | 2 | 9630290 |
| 4 | 6210 | 4 | 1226 | 1 | 1 | 3 | 1 | 2 | 1 | 7406250 |
df.info()
<class 'pandas.core.frame.DataFrame'> Index: 7103 entries, 0 to 7108 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 PRT_ID 7103 non-null int64 1 AREA 7103 non-null int32 2 INT_SQFT 7103 non-null int64 3 N_BEDROOM 7103 non-null int64 4 N_BATHROOM 7103 non-null int64 5 N_ROOM 7103 non-null int64 6 PARK_FACIL 7103 non-null int32 7 BUILDTYPE 7103 non-null int32 8 MZZONE 7103 non-null int32 9 SALES_PRICE 7103 non-null int64 dtypes: int32(4), int64(6) memory usage: 499.4 KB
# storing the Dependent Variables in X and Independent Variable in Y
x=df.drop('SALES_PRICE',axis=1)
y=df['SALES_PRICE']
# Splitting the Data into Training set and Testing Set
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.30,random_state=42)
x_train.shape,x_test.shape,y_train.shape,y_test.shape
((4972, 9), (2131, 9), (4972,), (2131,))
# Scaling the values to convert the int values to Machine Languages
from sklearn.preprocessing import MinMaxScaler
mmscaler=MinMaxScaler(feature_range=(0,1))
x_train=mmscaler.fit_transform(x_train)
x_test=mmscaler.fit_transform(x_test)
x_train=pd.DataFrame(x_train)
x_test=pd.DataFrame(x_test)
# Creating a Dataframe to store the metrics score
a={'Model Name':[], 'Mean_Absolute_Error MAE':[] ,'Mean_Absolute_Percentage_Error MAPE':[] ,'Mean_Squared_Error MSE':[],'Root_Mean_Squared_Error RMSE':[] ,'Root_Mean_Squared_Log_Error RMSLE':[] ,'R2 score':[],'Adj_R_Square':[]}
Results=pd.DataFrame(a)
Results.head()
| Model Name | Mean_Absolute_Error MAE | Mean_Absolute_Percentage_Error MAPE | Mean_Squared_Error MSE | Root_Mean_Squared_Error RMSE | Root_Mean_Squared_Log_Error RMSLE | R2 score | Adj_R_Square |
|---|
# Build the Regression / Regressor models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import xgboost as xgb
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
# Create objects of Regression / Regressor models with default hyper-parameters
modelmlg = LinearRegression()
modeldcr = DecisionTreeRegressor()
modelrfr = RandomForestRegressor()
modelSVR = SVR()
modelXGR = xgb.XGBRegressor()
modelKNN = KNeighborsRegressor(n_neighbors=5)
modelETR = ExtraTreesRegressor()
modelGBR = GradientBoostingRegressor()
MM = [modelmlg, modeldcr, modelrfr, modelKNN, modelETR, modelGBR, modelXGR]
for models in MM:
# Fit the model with train data
models.fit(x_train, y_train)
# Predict the model with test data
y_pred = models.predict(x_test)
# Print the model name
print('Model Name: ', models)
# Evaluation metrics for Regression analysis
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_squared_log_error
from sklearn import metrics
# Assuming y_true are the actual values and y_pred are the predicted values
mae = mean_absolute_error(y_test, y_pred)
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
rmsle = np.log(rmse)
r_squared = r2_score(y_test, y_pred)
# Adjusted R-squared
n = len(y_test)
p = x.shape[1] # Number of features
adj_r_squared = 1 - (1 - r_squared) * ((n - 1) / (n - p - 1))
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Absolute Percentage Error (MAPE): {mape}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"Root Mean Squared Log Error (RMSLE): {rmsle}")
print(f"R-squared (R^2): {r_squared}")
print(f"Adjusted R-squared: {adj_r_squared}")
print('------------------------------------------------------------------------------------------------------------')
#-------------------------------------------------------------------------------------------
new_row = pd.DataFrame({'Model Name': [str(models)],
'Mean_Absolute_Error MAE': [metrics.mean_absolute_error(y_test, y_pred)],
'Mean_Absolute_Percentage_Error MAPE': [np.mean(np.abs((y_test - y_pred) / y_test)) * 100],
'Mean_Squared_Error MSE': [metrics.mean_squared_error(y_test, y_pred)],
'Root_Mean_Squared_Error RMSE': [np.sqrt(metrics.mean_squared_error(y_test, y_pred))],
'Root_Mean_Squared_Log_Error RMSLE': [np.log(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))],
'R2 score': [metrics.r2_score(y_test, y_pred)],
'Adj_R_Square': [adj_r_squared]})
# Append new_row to Results
Results = pd.concat([Results, new_row], ignore_index=True)
Model Name: LinearRegression()
Mean Absolute Error (MAE): 1425145.8457698394
Mean Absolute Percentage Error (MAPE): 14.474119138246595
Mean Squared Error (MSE): 3036548275910.5264
Root Mean Squared Error (RMSE): 1742569.4465101028
Root Mean Squared Log Error (RMSLE): 14.370871275347811
R-squared (R^2): 0.7865146189458103
Adjusted R-squared: 0.7856087403840528
------------------------------------------------------------------------------------------------------------
Model Name: DecisionTreeRegressor()
Mean Absolute Error (MAE): 696385.4856874706
Mean Absolute Percentage Error (MAPE): 7.363178860245062
Mean Squared Error (MSE): 794892032546.6682
Root Mean Squared Error (RMSE): 891567.1778092036
Root Mean Squared Log Error (RMSLE): 13.700736067130293
R-squared (R^2): 0.9441148919609125
Adjusted R-squared: 0.9438777557174651
------------------------------------------------------------------------------------------------------------
Model Name: RandomForestRegressor()
Mean Absolute Error (MAE): 530397.0632801501
Mean Absolute Percentage Error (MAPE): 5.670921520716918
Mean Squared Error (MSE): 441589767563.65186
Root Mean Squared Error (RMSE): 664522.2099852283
Root Mean Squared Log Error (RMSLE): 13.406823580181438
R-squared (R^2): 0.9689539071234792
Adjusted R-squared: 0.9688221698128292
------------------------------------------------------------------------------------------------------------
Model Name: KNeighborsRegressor()
Mean Absolute Error (MAE): 598438.4739558892
Mean Absolute Percentage Error (MAPE): 6.2610300515341635
Mean Squared Error (MSE): 561350972685.7156
Root Mean Squared Error (RMSE): 749233.5902011573
Root Mean Squared Log Error (RMSLE): 13.526806083305642
R-squared (R^2): 0.9605340618953225
Adjusted R-squared: 0.9603665968114271
------------------------------------------------------------------------------------------------------------
Model Name: ExtraTreesRegressor()
Mean Absolute Error (MAE): 541174.539183482
Mean Absolute Percentage Error (MAPE): 5.797972934424993
Mean Squared Error (MSE): 452729270159.7874
Root Mean Squared Error (RMSE): 672851.5959405814
Root Mean Squared Log Error (RMSLE): 13.419280073078268
R-squared (R^2): 0.9681707412586859
Adjusted R-squared: 0.968035680754833
------------------------------------------------------------------------------------------------------------
Model Name: GradientBoostingRegressor()
Mean Absolute Error (MAE): 519218.5429586458
Mean Absolute Percentage Error (MAPE): 5.435603510333493
Mean Squared Error (MSE): 407033906383.53424
Root Mean Squared Error (RMSE): 637992.0895932287
Root Mean Squared Log Error (RMSLE): 13.366081163493368
R-squared (R^2): 0.9713833666681265
Adjusted R-squared: 0.9712619382381469
------------------------------------------------------------------------------------------------------------
Model Name: XGBRegressor(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=None, n_jobs=None,
num_parallel_tree=None, random_state=None, ...)
Mean Absolute Error (MAE): 522382.30091506336
Mean Absolute Percentage Error (MAPE): 5.533325394683955
Mean Squared Error (MSE): 422838835481.29785
Root Mean Squared Error (RMSE): 650260.5904414767
Root Mean Squared Log Error (RMSLE): 13.385128469901034
R-squared (R^2): 0.9702721966802367
Adjusted R-squared: 0.9701460532432362
------------------------------------------------------------------------------------------------------------
Results
| Model Name | Mean_Absolute_Error MAE | Mean_Absolute_Percentage_Error MAPE | Mean_Squared_Error MSE | Root_Mean_Squared_Error RMSE | Root_Mean_Squared_Log_Error RMSLE | R2 score | Adj_R_Square | |
|---|---|---|---|---|---|---|---|---|
| 0 | LinearRegression() | 1.425146e+06 | 14.474119 | 3.036548e+12 | 1.742569e+06 | 14.370871 | 0.786515 | 0.785609 |
| 1 | DecisionTreeRegressor() | 6.963855e+05 | 7.363179 | 7.948920e+11 | 8.915672e+05 | 13.700736 | 0.944115 | 0.943878 |
| 2 | RandomForestRegressor() | 5.303971e+05 | 5.670922 | 4.415898e+11 | 6.645222e+05 | 13.406824 | 0.968954 | 0.968822 |
| 3 | KNeighborsRegressor() | 5.984385e+05 | 6.261030 | 5.613510e+11 | 7.492336e+05 | 13.526806 | 0.960534 | 0.960367 |
| 4 | ExtraTreesRegressor() | 5.411745e+05 | 5.797973 | 4.527293e+11 | 6.728516e+05 | 13.419280 | 0.968171 | 0.968036 |
| 5 | GradientBoostingRegressor() | 5.192185e+05 | 5.435604 | 4.070339e+11 | 6.379921e+05 | 13.366081 | 0.971383 | 0.971262 |
| 6 | XGBRegressor(base_score=None, booster=None, ca... | 5.223823e+05 | 5.533325 | 4.228388e+11 | 6.502606e+05 | 13.385128 | 0.970272 | 0.970146 |
From the above Results, the Top 3 Models by Comparing Adjacent R Square Values are¶
- XGBRegressor
- GradientBoostingRegressor
- ExtraTreesRegressor
Training and Predicting with XGBRegressor
# Training the Model
modelXGR.fit(x_train, y_train)
# Predict the model with test data
y_pred = modelXGR.predict(x_test)
out=pd.DataFrame({'Price_actual':y_test,'Price_pred':y_pred})
result=df_bk.merge(out,left_index=True,right_index=True)
result
| PRT_ID | AREA | INT_SQFT | N_BEDROOM | N_BATHROOM | N_ROOM | PARK_FACIL | BUILDTYPE | MZZONE | SALES_PRICE | Price_actual | Price_pred | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 8 | 3377 | Chromepet | 771 | 1 | 1 | 2 | No | Others | RM | 8308970 | 8308970 | 8097472.0 |
| 14 | 4085 | Velachery | 1865 | 3 | 2 | 5 | No | Commercial | RM | 15499680 | 15499680 | 16066729.0 |
| 15 | 6328 | Velachery | 1868 | 3 | 2 | 5 | No | Commercial | RH | 15714080 | 15714080 | 15024930.0 |
| 17 | 2016 | Chromepet | 796 | 1 | 1 | 2 | Yes | Commercial | RL | 10912550 | 10912550 | 11018991.0 |
| 19 | 1372 | Anna Nagar | 1902 | 2 | 1 | 5 | Yes | Commercial | RL | 21203240 | 21203240 | 21672568.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7098 | 629 | Anna Nagar | 1611 | 1 | 1 | 4 | Yes | Others | RM | 13944780 | 13944780 | 14169602.0 |
| 7100 | 5438 | T Nagar | 1733 | 1 | 1 | 4 | Yes | Commercial | RL | 19501600 | 19501600 | 20383738.0 |
| 7102 | 5560 | Karapakkam | 701 | 1 | 1 | 2 | No | House | RH | 5643500 | 5643500 | 5284015.0 |
| 7103 | 5133 | Karapakkam | 1462 | 2 | 2 | 4 | No | Others | RM | 9387250 | 9387250 | 8950791.0 |
| 7104 | 3834 | Karapakkam | 598 | 1 | 1 | 2 | No | Others | RM | 5353000 | 5353000 | 6375991.5 |
2131 rows × 12 columns
result[['PRT_ID','AREA','Price_actual','Price_pred']].sample(20)
| PRT_ID | AREA | Price_actual | Price_pred | |
|---|---|---|---|---|
| 6568 | 9292 | Chromepet | 10001300 | 10726214.0 |
| 3856 | 8382 | Velachery | 13822920 | 13961900.0 |
| 3317 | 958 | Chromepet | 10847700 | 10797060.0 |
| 6199 | 6250 | Chromepet | 9544150 | 9912693.0 |
| 3361 | 8711 | Chromepet | 8111250 | 7270345.0 |
| 6342 | 5109 | Karapakkam | 5874250 | 5773031.0 |
| 3528 | 6073 | Karapakkam | 6434750 | 6807893.0 |
| 2628 | 7164 | Anna Nagar | 19147580 | 18473892.0 |
| 5712 | 5187 | Adyar | 8557760 | 8687759.0 |
| 5905 | 9668 | Velachery | 12498160 | 12269813.0 |
| 293 | 4821 | Karapakkam | 7849250 | 7778017.5 |
| 1182 | 8028 | Adyar | 9310300 | 8240811.5 |
| 4957 | 8500 | Anna Nagar | 15324610 | 14858333.0 |
| 5650 | 4893 | T Nagar | 13081220 | 13594749.0 |
| 1695 | 7086 | T Nagar | 20747530 | 22015104.0 |
| 4048 | 9502 | Chromepet | 9847570 | 9511261.0 |
| 5048 | 3936 | KK Nagar | 14373860 | 14100759.0 |
| 4844 | 5229 | Chromepet | 9688320 | 10237972.0 |
| 3425 | 8582 | Adyar | 7811430 | 8169637.0 |
| 4488 | 3333 | KK Nagar | 11131800 | 11805751.0 |
px.scatter(result, x='Price_actual', y='Price_pred', trendline='ols', color_discrete_sequence=['magenta'],
template='plotly_dark', title='<b> Actual Price Vs Predicted Price ')